1 Settings

library("tximport")
library("DESeq2")
library("org.Hs.eg.db")
library("ggplot2")
library("ggrepel")
library("reshape2")
library("gridExtra")
library("ComplexHeatmap")
library("clusterProfiler")
library("circlize")
library("ggpubr")
# define path
pathToData <- "./"

# define cutoffs for DEA
qValue <- 0.01
logFC <- 1

# define size gene sets
min_gs_size <- 10
max_gs_size <- 250

# ensembl transcripts to gene table
ensembl <- read.table(paste0(pathToData, "ensembl/ensembl_genes_transcripts_v100.txt"), header = T, sep = "\t", stringsAsFactors = F)
tx2gene <- ensembl[,c(4,2)]

2 PCA

# sampleTable
sampleTable <- read.table(paste0(pathToData, "metadata_RNAseq.tsv"), sep = "\t", header = T, stringsAsFactors = F)
sampleTable$fileName <- paste0(pathToData, "kallisto/", sampleTable$Sample, "_abundance.tsv")

# Import using tximport package
txi <- tximport(files = sampleTable$fileName, type = "kallisto", txIn = TRUE, txOut = FALSE, tx2gene = tx2gene)

# DESeq
sampleTable$Case <- factor(as.character(sampleTable$Case), levels = unique(sampleTable$Case))
sampleTable$Diagnosis <- factor(sampleTable$Diagnosis, levels = c("CLL", "RT"))
dds_CLLRT <- DESeqDataSetFromTximport(txi = txi, colData = sampleTable, design = ~ Case + Diagnosis)
keep <- rowSums(counts(dds_CLLRT)) >= 10
dds_CLLRT <- dds_CLLRT[keep,]
dds_CLLRT <- DESeq(dds_CLLRT)

vsd <- vst(dds_CLLRT, blind=FALSE)
vsd_CLLRT <- assay(vsd)
colnames(vsd_CLLRT) <- colData(dds_CLLRT)[,c("Sample")]

# PCA
pca <- prcomp(t(vsd_CLLRT), scale=T)
x <- summary(pca)
pcaTable <- cbind(sampleTable, pca$x[,1:6])

p12 <- ggplot(pcaTable, aes(x = PC1, y = PC2, fill=Diagnosis, label=Case)) +
  geom_point(aes(color=IGHV), size=3, pch=21) +
  scale_color_manual(values=c("#010101", "#B4B4B4")) + 
  scale_fill_manual(values=c("#EDEDED", "#CE899A"))+
  geom_text_repel(size=3.25) +
  theme_classic() + theme(axis.ticks = element_blank(), axis.text=element_blank()) + 
  xlab(paste0("PC1 (", round(x$importance[2,1]*100,1), "%)")) +
  ylab(paste0("PC2 (", round(x$importance[2,2]*100,1), "%)"))

p13 <- ggplot(pcaTable, aes(x = PC1, y = PC3, fill=Diagnosis, label=Case)) +
  geom_point(aes(color=IGHV), size=3, pch=21) +
  scale_color_manual(values=c("#010101", "#B4B4B4")) + 
  scale_fill_manual(values=c("#EDEDED", "#CE899A"))+
  geom_text_repel(size=3.25) +
  theme_classic() + theme(axis.ticks = element_blank(), axis.text=element_blank()) + 
  xlab(paste0("PC1 (", round(x$importance[2,1]*100,1), "%)")) +
  ylab(paste0("PC3 (", round(x$importance[2,3]*100,1), "%)")) 

p14 <- ggplot(pcaTable, aes(x = PC1, y = PC4, fill=Diagnosis, label=Case)) +
  geom_point(aes(color=IGHV), size=3, pch=21) +
  scale_color_manual(values=c("#010101", "#B4B4B4")) + 
  scale_fill_manual(values=c("#EDEDED", "#CE899A"))+
  geom_text_repel(size=3.25) +
  theme_classic() + theme(axis.ticks = element_blank(), axis.text=element_blank()) + 
  xlab(paste0("PC1 (", round(x$importance[2,1]*100,1), "%)")) +
  ylab(paste0("PC4 (", round(x$importance[2,4]*100,1), "%)")) 

pdf("outputs/RT_PCA.pdf", width = 12, height = 2.75, useDingbats = F)
grid.arrange(p12, p13, p14, ncol=3)
dev.off()
## quartz_off_screen 
##                 2
pdf("outputs/RT_PCA_12.pdf", width = 3.54, height = 2.75, useDingbats = F)
p12
dev.off()
## quartz_off_screen 
##                 2
grid.arrange(p12, p13, p14, ncol=3)

3 DEA

  • 4 CLL vs 4 RT (paired)
  • Cases 19 and 3299 excluded due to their intermediate gene expression profile

3.1 DEA code

# sampleTable
sampleTable2 <- sampleTable[! sampleTable$Case %in% c(19, 3299),]
sampleTable2$Case <- factor(as.character(sampleTable2$Case), levels=c("63", "365", "12", "816"))

# Import using tximport package
txi2 <- tximport(files = sampleTable2$fileName, type = "kallisto", txIn = TRUE, txOut = FALSE, tx2gene = tx2gene)

# DESeq
dds_CLLRT2 <- DESeqDataSetFromTximport(txi = txi2, colData = sampleTable2, design = ~ Case + Diagnosis)
keep <- rowSums(counts(dds_CLLRT2)) >= 10
dds_CLLRT2 <- dds_CLLRT2[keep,]
dds_CLLRT2 <- DESeq(dds_CLLRT2)
res_CLLRT2 <- results(dds_CLLRT2, alpha = qValue)
resLFC_CLLRT2 <- lfcShrink(dds_CLLRT2, coef="Diagnosis_RT_vs_CLL", res = res_CLLRT2, type="apeglm")

# # QC plots
# plotDispEsts(dds_CLLRT2, ylim = c(1e-6, 1e1) )
# DESeq2::plotMA(res_CLLRT2, main="res")
# DESeq2::plotMA(resLFC_CLLRT2, main="resLFC")
# hist(resLFC_CLLRT2$pvalue, breaks=20, col="grey")
# ## The ratio of small p values for genes binned by mean normalized count
# qs <- c(0, quantile(res_CLLRT2$baseMean[res_CLLRT2$baseMean > 0], 0:6/6))
# bins <- cut(res_CLLRT2$baseMean, qs)
# levels(bins) <- paste0("~", round(signif((qs[-1] + qs[-length(qs)])/2, 2)))
# fractionSig <- tapply(res_CLLRT2$pvalue, bins, function(p) mean(p < .01, na.rm = TRUE))
# barplot(fractionSig, xlab = "mean normalized count", ylab = "fraction of p values < 0.01")

# # QC numbers
# length(resLFC_CLLRT2$baseMean[is.na(resLFC_CLLRT2$baseMean)]) ## zero counts
# length(resLFC_CLLRT2$pvalue[is.na(resLFC_CLLRT2$pvalue)]) ## outliers
# length(resLFC_CLLRT2$padj[is.na(resLFC_CLLRT2$padj)]) ## automatic independent filtering
# dim(resLFC_CLLRT2)

# Add gene names
resLFC_CLLRT2$EnsemblGeneStableID <- rownames(resLFC_CLLRT2)
resLFC_CLLRT2$HGNC.symbol <- ensembl$HGNC.symbol[match(resLFC_CLLRT2$EnsemblGeneStableID, ensembl$Gene.stable.ID.version)]

# Add direction (up/down/NS)
resLFC_CLLRT2$Direction <- "NS"
resLFC_CLLRT2$Direction[resLFC_CLLRT2$padj < qValue & resLFC_CLLRT2$log2FoldChange > logFC] <- "Up"
resLFC_CLLRT2$Direction[resLFC_CLLRT2$padj < qValue & resLFC_CLLRT2$log2FoldChange < -logFC] <- "Down"
resLFC_CLLRT2$Direction <- factor(resLFC_CLLRT2$Direction, levels = c("NS", "Down", "Up"))

# Keep only genes with HGNC.symbol and remove duplicated HGNC.symbol
resLFC_CLLRT2_db <- data.frame(resLFC_CLLRT2)
resLFC_CLLRT2_db <- resLFC_CLLRT2_db[order(resLFC_CLLRT2_db$padj), ]
resLFC_CLLRT2_db <- resLFC_CLLRT2_db[resLFC_CLLRT2_db$HGNC.symbol != "", ]
resLFC_CLLRT2_db <- resLFC_CLLRT2_db[!duplicated(resLFC_CLLRT2_db$HGNC.symbol), ]
table(resLFC_CLLRT2_db$Direction)
## 
##    NS  Down    Up 
## 17306   809  1439
# Write table DEA
write.table(resLFC_CLLRT2_db[,c(6,7,1:5,8)], "outputs/RT_DEA.tsv", col.names = T, row.names = F, sep = "\t", quote = F)

DT::datatable(resLFC_CLLRT2_db[,c(6,7,1:5,8)], options = list(scrollX = T, scrollY=T), rownames = F)

3.2 Volcano plot

# Volcano plot
vp_CLLRT <- ggplot(resLFC_CLLRT2_db[!is.na(resLFC_CLLRT2_db$padj),], aes(log2FoldChange, -log10(pvalue), color=Direction)) + 
  geom_point() + theme_classic() + xlab("log2(FC)") + ylab("-log10(P)") +
  scale_color_manual(values=c("gray50", "dodgerblue4", "darkred")) +
  labs(color = NULL) + 
  annotate("text", x = -8, y=64, label = paste0("n=", nrow(resLFC_CLLRT2_db[resLFC_CLLRT2_db$Direction == "Down",])), col="dodgerblue4") +
  annotate("text", x=8, y=64, label= paste0("n=", nrow(resLFC_CLLRT2_db[resLFC_CLLRT2_db$Direction == "Up",])), col="darkred") + 
  theme(legend.position="top")

pdf("outputs/RT_volcano.pdf", width = 3, height = 3.8, useDingbats = F)
vp_CLLRT
dev.off()
## quartz_off_screen 
##                 2
vp_CLLRT

3.3 Heatmap

valsAll <- t(scale(t(vsd_CLLRT)))
colnames(valsAll) <- paste0(sampleTable$Case, " (", sampleTable$TimePoint, ")")
valsAll_DEG <- valsAll[rownames(valsAll) %in% rownames(resLFC_CLLRT2_db)[resLFC_CLLRT2_db$Direction!="NS"], ] 

sampleTable$UsedInDEA <- "Yes"
sampleTable$UsedInDEA[sampleTable$Case %in% c("19", "3299")] <- "No"

# define column order
colOrder <- c( "19 (T2)" , "3299 (T2)", "12 (T2)", "816 (T1)", "63 (T1)",  "365 (T1)",  "3299 (T4)", "19 (T6)", "12 (T6)", "816 (T3)","63 (T3)","365 (T3)")
valsAll_DEG <- valsAll_DEG[, match(colOrder, colnames(valsAll_DEG))]
sampleTableHeatmap <- sampleTable[match(colOrder, paste0(sampleTable$Case, " (", sampleTable$TimePoint, ")")),]

# heatmap annotation - columns
ha = HeatmapAnnotation(
   Diagnosis=sampleTableHeatmap$Diagnosis, IGHV=sampleTableHeatmap$IGHV, Tissue=sampleTableHeatmap$Tissue, DEA=sampleTableHeatmap$UsedInDEA,
    col = list(IGHV = c("M-IGHV"="#010101", "U-IGHV"="#B4B4B4"),
               Diagnosis = c("CLL"="#EDEDED", "RT"="#CE899A"),
               Tissue = c("Peripheral blood"="#FFDBB8", "Lymph node"="#CAF9E2"),
               DEA = c("Yes"="#FFF4B0", "No"="#F4F4F4")
               ),
    show_legend = T, border = T,
    show_annotation_name = T)

hm <- Heatmap(valsAll_DEG, 
             cluster_rows = T, show_row_dend = F, split = 2, show_row_names = FALSE,
             cluster_columns = F, 
             column_order = colOrder,
             column_split = c(1,1,2,2,2,2,3,3,4,4,4,4),
             top_annotation = ha, show_column_names = TRUE,
             row_title = c(paste0("n=", nrow(resLFC_CLLRT2_db[resLFC_CLLRT2_db$Direction == "Up",])),
                           paste0("n=", nrow(resLFC_CLLRT2_db[resLFC_CLLRT2_db$Direction == "Down",]))),
             row_title_rot = 0, 
             border="black", col = colorRamp2(c(-3, 0, 3), c("#343A8F", "white", "#D63027")),
             row_names_gp = gpar(fontsize = 4),
             heatmap_legend_param = list(title = "Gene z-scores")
             )

# heatmap annotation - rows
epi <- read.table("inputs_epigenetics/associated-regions-deg_epi_and_meth.txt", header = T, sep = "\t", stringsAsFactors = F)
epi <- epi[match(rownames(valsAll_DEG), epi$EnsemblGeneStableID, ),] 
table(epi$Direction.rnaseq)
## 
## Down   Up 
##  809 1439
# check overlap
table(epi$Direction.rnaseq, epi$Direction.methylation)
##       
##        Hyper Hypo   NS
##   Down     3   29  777
##   Up       4   57 1378
(3+29+4+57)/(3+29+777+4+57+1378)*100
## [1] 4.137011
table(epi$Direction.rnaseq, epi$Direction.H3K27ac)
##       
##        Decrease Increase Increase;Decrease   NS
##   Down      286       35                14  474
##   Up        225      173                12 1029
(286+173)/(286+35+14+474+225+173+12+1029)*100
## [1] 20.41815
table(epi$Direction.rnaseq, epi$Direction.atac)
##       
##        Decrease Increase   NS
##   Down       59      118  632
##   Up         40      294 1105
(59+294)/(59+118+632+40+294+1105)*100
## [1] 15.70285
# genes to highlight
genesToHighlight <- c(
  "MYCBP", # MYC up in 3 cases (add to boxplots)
  "WNT5A", "WNT3", "WNT16", "WNT10A", "WNT6",
  "TLR10", "TLR9", "TLR8",
  "TRAF5",
  "CXCR4",
  "MKI67", "PCNA", "TOP2A", 
  "CDKN1B",
  "CDK4", "CDK6", "CDK17","CDK1","CDK5", "CDKN3",
  "AICDA", # POLH LFC 0.88 pval < 0.01 (add to boxplots)
  "CCND2", # CCND3 in case 19 and 3299 (add to boxplots) 
  "HLA-A", "HLA-B",
  "ARID4A", "ARID4B", "ARID5B", "CREBBP", "EP300", "CHD7", "CHD2", "BAZ2A")
ha_row = rowAnnotation(foo = anno_mark(at = which(epi$HGNC.symbol %in% genesToHighlight), labels = epi$HGNC.symbol[which(epi$HGNC.symbol %in% genesToHighlight)], labels_gp = gpar(fontsize = 5)))

# list of heatmaps
ht_list = hm + 
  Heatmap(epi$Direction.rnaseq, name = "RNA-seq", width = unit(0.1, "cm"), border = T, col = c("Down"="#343A8F", "Up"="#D63027", "NS"="white")) +
  Heatmap(epi$Direction.methylation, name = "DNA meth.", width = unit(0.5, "cm"), border = T, col = c("Hypo"="#34398E", "Hyper"="#A61829", "NS"="white")) +
  Heatmap(epi$Direction.H3K27ac, name = "H3K27ac", width = unit(0.5, "cm"), border = T, col = c("Decrease"="#0B652E", "Increase"="#F6A318", "Increase;Decrease"="gray70", "NS"="white")) +
  Heatmap(epi$Direction.atac, name = "ATAC-seq", width = unit(0.5, "cm"), border = T, col = c("Decrease"="#0B652E", "Increase"="#F6A318", "NS"="white"), right_annotation = ha_row)

# plot
pdf("outputs/RT_heatmap.pdf", width = 9.5, height = 8, useDingbats = F)
ht_list
dev.off()
## quartz_off_screen 
##                 2
ht_list

3.4 Boxplots

# normalized counts
normalizedCounts <- counts(dds_CLLRT, normalized=TRUE)
colnames(normalizedCounts) <- dds_CLLRT$Sample

# keep only genes kept in DEA table
normalizedCounts <- normalizedCounts[rownames(normalizedCounts) %in% rownames(resLFC_CLLRT2_db),]
normalizedCounts <- data.frame(t(normalizedCounts), stringsAsFactors = F)

# add metadata
normalizedCounts$Case <- sampleTable$Case
normalizedCounts$Sample <- sampleTable$Sample
normalizedCounts$Diagnosis <- sampleTable$Diagnosis
normalizedCounts$IGHV <- sampleTable$IGHV
normalizedCounts$Tissue <- sampleTable$Tissue

# order
normalizedCounts <- normalizedCounts[,c(19555:19559, 1:19554)]

# by HGNC.symbol
normalizedCountsHGNC.symbol <- normalizedCounts
colnames(normalizedCountsHGNC.symbol)[6:ncol(normalizedCountsHGNC.symbol)] <- resLFC_CLLRT2_db$HGNC.symbol[match(colnames(normalizedCountsHGNC.symbol)[6:ncol(normalizedCountsHGNC.symbol)], rownames(resLFC_CLLRT2_db))]

# save
write.table(normalizedCounts, "outputs/normalizedCounts_ensembl.tsv", row.names = F, col.names = T, sep = "\t", quote = F)
write.table(normalizedCountsHGNC.symbol, "outputs/normalizedCounts_HGNCsymbol.tsv", row.names = F, col.names = T, sep = "\t", quote = F)
# selected genes based on DEA
genes <- c("MKI67", "CDK1", "CDK4","CDK5","CDK6","CDKN3","CDK17",
           "WNT3", "WNT5A", "WNT6", "WNT16", "TLR8", "TLR9", "TLR10", 
           "AICDA", "POLH", "MYC", "MYCBP", "CCND2", "CCND3")

mDB <- melt(normalizedCountsHGNC.symbol, id.vars = c("Case", "Sample", "Diagnosis", "IGHV", "Tissue"))
mDBmini <- mDB[mDB$variable %in% genes,]
mDBmini$variable <- factor(mDBmini$variable, levels = genes)
mDBmini$value <- mDBmini$value/1000
a <- ggplot(mDBmini, aes(Diagnosis, value, fill=Diagnosis)) +
  geom_boxplot(alpha=0.25, outlier.shape = NA) + 
  geom_jitter(width = 0.2, height = 0, pch=21, size=2) + 
  theme_classic() +
  scale_fill_manual(values=c("#EDEDED", "#CE899A")) +
  ylab("Expression (normalized counts, x1000)") + xlab(NULL) +
  theme(legend.position="top") +
  geom_text_repel(data = mDBmini[mDBmini$Diagnosis=="RT",], aes(label = Case), size=3)+
  facet_wrap(~ variable, scales = "free", nrow=3)

pdf("outputs/RT_boxplots.pdf", width = 6*1.2, height = 4.2*1.2, useDingbats = F)
a
dev.off()
## quartz_off_screen 
##                 2
a

# selected driver genes based on genomic alterations
MYCN <- ggplot(normalizedCountsHGNC.symbol, aes(Diagnosis, MYCN/1000, fill=Diagnosis)) +
  geom_boxplot(alpha=0.25, outlier.shape = NA) + 
  geom_jitter(width = 0.2, height = 0, pch=21, size=2) + 
  theme_classic() +
  scale_fill_manual(values=c("#EDEDED", "#CE899A")) +
  ylab("Expression\n(norm. counts, x1000)") + xlab(NULL) + ggtitle("MYCN") +
  theme(legend.position="top") +
  geom_text_repel(data = normalizedCountsHGNC.symbol[normalizedCountsHGNC.symbol$Case==816,], aes(label = Case), size=3)

CDKN1B <- ggplot(normalizedCountsHGNC.symbol, aes(Diagnosis, CDKN1B/1000, fill=Diagnosis)) +
  geom_boxplot(alpha=0.25, outlier.shape = NA) + 
  geom_jitter(width = 0.2, height = 0, pch=21, size=2) + 
  theme_classic() +
  scale_fill_manual(values=c("#EDEDED", "#CE899A")) +
  ylab("Expression\n(norm. counts, x1000)") + xlab(NULL) + ggtitle("CDKN1B") +
  theme(legend.position="top") +
  geom_text_repel(data = normalizedCountsHGNC.symbol[normalizedCountsHGNC.symbol$Case==19,], aes(label = Case), size=3) +
  stat_compare_means(method="t.test")

ARID4B <- ggplot(normalizedCountsHGNC.symbol, aes(Diagnosis, ARID4B/1000, fill=Diagnosis)) +
  geom_boxplot(alpha=0.25, outlier.shape = NA) + 
  geom_jitter(width = 0.2, height = 0, pch=21, size=2) + 
  theme_classic() +
  scale_fill_manual(values=c("#EDEDED", "#CE899A")) +
  ylab("Expression\n(norm. counts, x1000)") + xlab(NULL) + ggtitle("ARID4B") +
  theme(legend.position="top") +
  geom_text_repel(data = normalizedCountsHGNC.symbol[normalizedCountsHGNC.symbol$Case==816,], aes(label = Case), size=3)+
  stat_compare_means(method="t.test")

ARID1A <- ggplot(normalizedCountsHGNC.symbol, aes(Diagnosis, ARID1A/1000, fill=Diagnosis)) +
  geom_boxplot(alpha=0.25, outlier.shape = NA) + 
  geom_jitter(width = 0.2, height = 0, pch=21, size=2) + 
  theme_classic() +
  scale_fill_manual(values=c("#EDEDED", "#CE899A")) +
  ylab("Expression\n(norm. counts, x1000)") + xlab(NULL) + ggtitle("ARID1A") +
  theme(legend.position="top") +
  geom_text_repel(data = normalizedCountsHGNC.symbol[normalizedCountsHGNC.symbol$Case %in% c(63, 365),], aes(label = Case), size=3)+
  stat_compare_means(method="t.test")

CHD2 <- ggplot(normalizedCountsHGNC.symbol, aes(Diagnosis, CHD2/1000, fill=Diagnosis)) +
  geom_boxplot(alpha=0.25, outlier.shape = NA) + 
  geom_jitter(width = 0.2, height = 0, pch=21, size=2) + 
  theme_classic() +
  scale_fill_manual(values=c("#EDEDED", "#CE899A")) +
  ylab("Expression\n(norm. counts, x1000)") + xlab(NULL) + ggtitle("CHD2") +
  theme(legend.position="top") +
  stat_compare_means(method="t.test")

EP300 <- ggplot(normalizedCountsHGNC.symbol, aes(Diagnosis, EP300/1000, fill=Diagnosis)) +
  geom_boxplot(alpha=0.25, outlier.shape = NA) + 
  geom_jitter(width = 0.2, height = 0, pch=21, size=2) + 
  theme_classic() +
  scale_fill_manual(values=c("#EDEDED", "#CE899A")) +
  ylab("Expression\n(norm. counts, x1000)") + xlab(NULL) + ggtitle("EP300") +
  theme(legend.position="top") +
  geom_text_repel(data = normalizedCountsHGNC.symbol[normalizedCountsHGNC.symbol$Case==12,], aes(label = Case), size=3)+
  stat_compare_means(method="t.test")

CREBBP <- ggplot(normalizedCountsHGNC.symbol, aes(Diagnosis, CREBBP/1000, fill=Diagnosis)) +
  geom_boxplot(alpha=0.25, outlier.shape = NA) + 
  geom_jitter(width = 0.2, height = 0, pch=21, size=2) + 
  theme_classic() +
  scale_fill_manual(values=c("#EDEDED", "#CE899A")) +
  ylab("Expression\n(norm. counts, x1000)") + xlab(NULL) + ggtitle("CREBBP") +
  theme(legend.position="top") +
  geom_text_repel(data = normalizedCountsHGNC.symbol[normalizedCountsHGNC.symbol$Case==12,], aes(label = Case), size=3)+
  stat_compare_means(method="t.test")

TRAF3 <- ggplot(normalizedCountsHGNC.symbol, aes(Diagnosis, TRAF3/1000, fill=Diagnosis)) +
  geom_boxplot(alpha=0.25, outlier.shape = NA) + 
  geom_jitter(width = 0.2, height = 0, pch=21, size=2) + 
  theme_classic() +
  scale_fill_manual(values=c("#EDEDED", "#CE899A")) +
  ylab("Expression\n(norm. counts, x1000)") + xlab(NULL) + ggtitle("TRAF3") +
  theme(legend.position="top") +
  geom_text_repel(data = normalizedCountsHGNC.symbol[normalizedCountsHGNC.symbol$Case==12,], aes(label = Case), size=3)+
  stat_compare_means(method="t.test")

SPEN <- ggplot(normalizedCountsHGNC.symbol, aes(Diagnosis, SPEN/1000, fill=Diagnosis)) +
  geom_boxplot(alpha=0.25, outlier.shape = NA) + 
  geom_jitter(width = 0.2, height = 0, pch=21, size=2) + 
  theme_classic() +
  scale_fill_manual(values=c("#EDEDED", "#CE899A")) +
  ylab("Expression\n(norm. counts, x1000)") + xlab(NULL) + ggtitle("SPEN") +
  theme(legend.position="top") +
  geom_text_repel(data = normalizedCountsHGNC.symbol[normalizedCountsHGNC.symbol$Case==12,], aes(label = Case), size=3)+
  stat_compare_means(method="t.test")

TNFRSF14 <- ggplot(normalizedCountsHGNC.symbol, aes(Diagnosis, TNFRSF14/1000, fill=Diagnosis)) +
  geom_boxplot(alpha=0.25, outlier.shape = NA) + 
  geom_jitter(width = 0.2, height = 0, pch=21, size=2) + 
  theme_classic() +
  scale_fill_manual(values=c("#EDEDED", "#CE899A")) +
  ylab("Expression\n(norm. counts, x1000)") + xlab(NULL) + ggtitle("TNFRSF14") +
  theme(legend.position="top") +
  geom_text_repel(data = normalizedCountsHGNC.symbol[normalizedCountsHGNC.symbol$Case==12,], aes(label = Case), size=3)+
  stat_compare_means(method="t.test")

pdf("outputs/RT_boxplots_drivers.pdf", width = 10, height = 6, useDingbats = F)
grid.arrange(MYCN, CDKN1B, ARID4B, ARID1A, CHD2, EP300, CREBBP, TRAF3, SPEN, TNFRSF14, nrow=2)
dev.off()
## quartz_off_screen 
##                 2
grid.arrange(MYCN, CDKN1B, ARID4B, ARID1A, CHD2, EP300, CREBBP, TRAF3, SPEN, TNFRSF14, nrow=2)

4 GSEA

# pre-ranked gene list ordered by -log10(pvalue) x (sign of fold change)
gsea_res <- resLFC_CLLRT2_db[!is.na(resLFC_CLLRT2_db$pvalue),]
gsea_res$score <- -log10(gsea_res$pvalue) * ifelse(gsea_res$log2FoldChange>0, 1, -1)
ranks <- gsea_res[,c("HGNC.symbol", "score")] 
ranks <- ranks[order(ranks$score, decreasing = T), ]
ranks <- setNames(ranks$score, ranks$HGNC.symbol)

4.1 Hallmarks

t2g <- read.gmt(paste0(pathToData, "MSigDB/h.all.v7.4.symbols.gmt"))
gseaH <- GSEA(sort(ranks, decreasing = T), TERM2GENE=t2g, verbose=F, nPerm = 10000, minGSSize = min_gs_size, maxGSSize = max_gs_size, seed = T)
gseaH_res <- gseaH@result
rownames(gseaH_res) <- NULL
gseaH_res <- gseaH_res[order(gseaH_res$p.adjust, decreasing = F),]
gseaH_res <- gseaH_res[order(abs(gseaH_res$NES), decreasing = T),]

write.table(gseaH_res, "outputs/RT_gsea_H.tsv", col.names = T, row.names = F, sep = "\t", quote = F)
DT::datatable(gseaH_res[,2:10], options = list(scrollX = T, scrollY=T), rownames = F)

4.2 C2.cp

t2g <- read.gmt(paste0(pathToData, "MSigDB/c2.cp.v7.4.symbols.gmt"))
gseaC2 <- GSEA(sort(ranks, decreasing = T), TERM2GENE=t2g, verbose=F, nPerm = 10000, minGSSize = min_gs_size, maxGSSize = max_gs_size, seed = T)
gseaC2_res <- gseaC2@result
rownames(gseaC2_res) <- NULL
gseaC2_res <- gseaC2_res[order(gseaC2_res$p.adjust, decreasing = F),]
gseaC2_res <- gseaC2_res[order(abs(gseaC2_res$NES), decreasing = T),]

write.table(gseaC2_res, "outputs/RT_gsea_C2cp.tsv", col.names = T, row.names = F, sep = "\t", quote = F)
DT::datatable(gseaC2_res[,2:10], options = list(scrollX = T, scrollY=T), rownames = F)

4.3 GO

gseGO_res <- gseGO(ranks, ont = "BP", OrgDb = org.Hs.eg.db, keyType = "SYMBOL", nPerm = 10000, minGSSize = min_gs_size, maxGSSize = max_gs_size, seed = T)
gseGO_Sim <- clusterProfiler::simplify(gseGO_res, cutoff = 0.35)
gseGO_SimRes <- gseGO_Sim@result
rownames(gseGO_SimRes) <- NULL
gseGO_SimRes <- gseGO_SimRes[order(gseGO_SimRes$p.adjust, decreasing = F),]
gseGO_SimRes <- gseGO_SimRes[order(abs(gseGO_SimRes$NES), decreasing = T),]

write.table(gseGO_SimRes, "outputs/RT_gsea_GO.tsv", col.names = T, row.names = F, sep = "\t", quote = F)
DT::datatable(gseGO_SimRes[,1:10], options = list(scrollX = T, scrollY=T), rownames = F)

4.4 Plots

gsea_bp <- rbind(gseaH_res[gseaH_res$ID == "HALLMARK_E2F_TARGETS",],
                 gseaH_res[gseaH_res$ID == "HALLMARK_G2M_CHECKPOINT",],
                 gseaH_res[gseaH_res$ID == "HALLMARK_MYC_TARGETS_V1",],
                 gseaH_res[gseaH_res$ID == "HALLMARK_MTORC1_SIGNALING",],
                 gseaH_res[gseaH_res$ID == "HALLMARK_OXIDATIVE_PHOSPHORYLATION",],
                 gseaC2_res[gseaC2_res$ID == "REACTOME_MITOCHONDRIAL_TRANSLATION",],
                 gseaH_res[gseaH_res$ID == "HALLMARK_GLYCOLYSIS",],
                 gseaH_res[gseaH_res$ID == "HALLMARK_REACTIVE_OXYGEN_SPECIES_PATHWAY",],
                 gseaH_res[gseaH_res$ID == "HALLMARK_DNA_REPAIR",],
                 gseaC2_res[gseaC2_res$ID == "PID_BCR_5PATHWAY",])

gsea_bp$ID <- gsub("HALLMARK_", "", gsea_bp$ID)
gsea_bp$ID <- gsub("OXIDATIVE_PHOSPHORYLATION", "OXPHOS", gsea_bp$ID)
gsea_bp$ID <- gsub("REACTOME_MITOCHONDRIAL_TRANSLATION", "MITOCHONDRIAL_TRANSLATION", gsea_bp$ID)
gsea_bp$ID <- gsub("REACTIVE_OXYGEN_SPECIES", "ROS_", gsea_bp$ID)
gsea_bp$ID <- gsub("PID_BCR_5PATHWAY", "BCR_PATHWAY", gsea_bp$ID)
gsea_bp$ID <- gsub("_V1", "", gsea_bp$ID)

gsea_bp$ID <- gsub("_", " ", gsea_bp$ID)

gsea_bp$ID <- factor(gsea_bp$ID, levels = gsea_bp$ID)
gsea_bp$Direction <- "Up"
gsea_bp$Direction[gsea_bp$NES<0] <- "Down"

bp <- ggplot(gsea_bp,aes(x=ID, y=NES, fill=Direction)) +
  geom_bar(stat = "identity", color="black", width = 0.75) +  
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
  geom_hline(yintercept = 0) + 
  ylab("NES") + xlab(NULL) + 
  scale_fill_manual(values=c("#8882BA", "#EF7F6A")) + theme(legend.position = "none")

pdf("outputs/RT_GSEA_bp.pdf", height = 3, width = 4, useDingbats = F)
bp
dev.off()
## quartz_off_screen 
##                 2
bp

g1 <- gseaplot(gseaH, by = "runningScore", title = "HALLMARK_E2F_TARGETS", geneSetID = "HALLMARK_E2F_TARGETS")
g2 <- gseaplot(gseaH, by = "runningScore", title = "HALLMARK_G2M_CHECKPOINT", geneSetID = "HALLMARK_G2M_CHECKPOINT")
g3 <- gseaplot(gseaH, by = "runningScore", title = "HALLMARK_MYC_TARGETS_V1", geneSetID = "HALLMARK_MYC_TARGETS_V1")
g4 <- gseaplot(gseaH, by = "runningScore", title = "HALLMARK_MYC_TARGETS_V2", geneSetID = "HALLMARK_MYC_TARGETS_V2")
g5 <- gseaplot(gseaH, by = "runningScore", title = "HALLMARK_MTORC1_SIGNALING", geneSetID = "HALLMARK_MTORC1_SIGNALING")
g6 <- gseaplot(gseaH, by = "runningScore", title = "HALLMARK_OXIDATIVE_PHOSPHORYLATION", geneSetID = "HALLMARK_OXIDATIVE_PHOSPHORYLATION")
g7 <- gseaplot(gseaH, by = "runningScore", title = "HALLMARK_GLYCOLYSIS", geneSetID = "HALLMARK_GLYCOLYSIS")
g8 <- gseaplot(gseaH, by = "runningScore", title = "HALLMARK_REACTIVE_OXYGEN_SPECIES_PATHWAY", geneSetID = "HALLMARK_REACTIVE_OXYGEN_SPECIES_PATHWAY")
g9 <- gseaplot(gseaH, by = "runningScore", title = "HALLMARK_DNA_REPAIR", geneSetID = "HALLMARK_DNA_REPAIR")
g10 <- gseaplot(gseaC2, geneSetID = "PID_BCR_5PATHWAY", by = "runningScore", title = "PID_BCR_5PATHWAY")

pdf("outputs/RT_GSEA_enrichments_main.pdf", width = 3, height = 4, useDingbats = F)
grid.arrange(g6,g10, nrow=2)
dev.off()
## quartz_off_screen 
##                 2
grid.arrange(g6,g10, nrow=2)

lay <- matrix(c(1,2,3,4,5,6,7,8), ncol=4)
pdf("outputs/RT_GSEA_enrichments_supple.pdf", width = 16, height = 6, useDingbats = F)
grid.arrange(arrangeGrob(g1,g2,g3,g4,g5,g7,g8,g9, layout_matrix = lay))
dev.off()
## quartz_off_screen 
##                 2
grid.arrange(arrangeGrob(g1,g2,g3,g4,g5,g7,g8,g9, layout_matrix = lay))

5 Session

sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS  10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
##  [1] grid      parallel  stats4    stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] ggpubr_0.4.0                circlize_0.4.10            
##  [3] clusterProfiler_3.14.3      ComplexHeatmap_2.2.0       
##  [5] gridExtra_2.3               reshape2_1.4.4             
##  [7] ggrepel_0.8.2               ggplot2_3.3.5              
##  [9] org.Hs.eg.db_3.10.0         AnnotationDbi_1.50.3       
## [11] DESeq2_1.26.0               SummarizedExperiment_1.16.1
## [13] DelayedArray_0.12.3         BiocParallel_1.20.1        
## [15] matrixStats_0.56.0          Biobase_2.46.0             
## [17] GenomicRanges_1.40.0        GenomeInfoDb_1.24.2        
## [19] IRanges_2.22.2              S4Vectors_0.26.1           
## [21] BiocGenerics_0.32.0         tximport_1.14.2            
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.3.1           backports_1.1.9        Hmisc_4.4-1           
##   [4] fastmatch_1.1-0        plyr_1.8.6             igraph_1.2.6          
##   [7] splines_3.6.3          crosstalk_1.1.0.1      urltools_1.7.3        
##  [10] digest_0.6.25          htmltools_0.5.1.1      GOSemSim_2.12.1       
##  [13] viridis_0.5.1          GO.db_3.10.0           fansi_0.4.1           
##  [16] magrittr_2.0.1         checkmate_2.0.0        memoise_1.1.0         
##  [19] cluster_2.1.0          openxlsx_4.1.5         readr_1.4.0           
##  [22] annotate_1.64.0        graphlayouts_0.7.1     bdsmatrix_1.3-4       
##  [25] enrichplot_1.6.1       prettyunits_1.1.1      jpeg_0.1-8.1          
##  [28] colorspace_1.4-1       apeglm_1.8.0           blob_1.2.1            
##  [31] haven_2.3.1            xfun_0.21              dplyr_1.0.5           
##  [34] crayon_1.4.1           RCurl_1.98-1.2         jsonlite_1.7.2        
##  [37] genefilter_1.68.0      survival_3.2-3         glue_1.4.2            
##  [40] polyclip_1.10-0        gtable_0.3.0           zlibbioc_1.32.0       
##  [43] XVector_0.28.0         GetoptLong_1.0.2       car_3.0-9             
##  [46] shape_1.4.4            abind_1.4-5            scales_1.1.1          
##  [49] DOSE_3.12.0            mvtnorm_1.1-1          DBI_1.1.0             
##  [52] rstatix_0.6.0          Rcpp_1.0.5             emdbook_1.3.12        
##  [55] viridisLite_0.3.0      xtable_1.8-4           progress_1.2.2        
##  [58] htmlTable_2.0.1        clue_0.3-57            gridGraphics_0.5-1    
##  [61] foreign_0.8-75         bit_4.0.4              europepmc_0.4         
##  [64] Formula_1.2-3          DT_0.15                htmlwidgets_1.5.3     
##  [67] httr_1.4.2             fgsea_1.12.0           RColorBrewer_1.1-2    
##  [70] ellipsis_0.3.1         pkgconfig_2.0.3        XML_3.99-0            
##  [73] farver_2.0.3           nnet_7.3-14            sass_0.3.1            
##  [76] locfit_1.5-9.4         utf8_1.1.4             labeling_0.3          
##  [79] ggplotify_0.0.7        tidyselect_1.1.0       rlang_0.4.10          
##  [82] cellranger_1.1.0       munsell_0.5.0          tools_3.6.3           
##  [85] generics_0.0.2         RSQLite_2.2.0          broom_0.7.6           
##  [88] ggridges_0.5.2         evaluate_0.14          stringr_1.4.0         
##  [91] yaml_2.2.1             knitr_1.29             bit64_4.0.5           
##  [94] tidygraph_1.2.0        zip_2.1.1              purrr_0.3.4           
##  [97] ggraph_2.0.5           DO.db_2.9              xml2_1.3.2            
## [100] compiler_3.6.3         rstudioapi_0.13        curl_4.3              
## [103] png_0.1-7              ggsignif_0.6.0         tibble_3.1.1          
## [106] tweenr_1.0.2           geneplotter_1.64.0     bslib_0.2.4           
## [109] stringi_1.4.6          forcats_0.5.1          lattice_0.20-41       
## [112] Matrix_1.2-18          vctrs_0.3.7            pillar_1.6.0          
## [115] lifecycle_1.0.0        BiocManager_1.30.10    triebeard_0.3.0       
## [118] jquerylib_0.1.3        GlobalOptions_0.1.2    data.table_1.12.8     
## [121] cowplot_1.0.0          bitops_1.0-6           qvalue_2.18.0         
## [124] R6_2.4.1               latticeExtra_0.6-29    rio_0.5.16            
## [127] MASS_7.3-52            assertthat_0.2.1       rjson_0.2.20          
## [130] withr_2.4.2            GenomeInfoDbData_1.2.2 hms_1.0.0             
## [133] rpart_4.1-15           coda_0.19-4            tidyr_1.1.3           
## [136] rvcheck_0.1.8          rmarkdown_2.7          carData_3.0-4         
## [139] bbmle_1.0.23.1         ggforce_0.3.3          numDeriv_2016.8-1.1   
## [142] base64enc_0.1-3